
* Project root directory, passed as the first positional argument of this do-file.
global root_dir = "`1'"

* Load the project configuration.
* NOTE(review): presumably defines the directory globals used in this file
* (log_dir, alm_data_proc, alm_data_raw, numb_dir, final_dir) - confirm.
include "$root_dir/code/config/config.do"

* Close a log named dat that an earlier aborted run in the same session may
* have left open; otherwise the log using below fails because the name is
* already in use and this run would not get a fresh log file.
cap log close dat

* Quote the path so that a log directory containing spaces still works.
cap noi log using "${log_dir}/10_prep_beaind_regressions.log", replace name(dat)

* Optional arguments 2-5: the caller passes the placeholder ___EMPTY___ for
* an argument that should be treated as not supplied. Store the cleaned
* values in the globals arg1-arg4.
forvalues k = 1/4 {
    local pos = `k' + 1
    global arg`k' = cond("``pos''" == "___EMPTY___", "", "``pos''")
}

* Only overwrite a setting when the corresponding argument was supplied, so
* that a value defined earlier (e.g. by the included config file) is kept.
if ("${arg1}" != "") {
    global weight_category "${arg1}"
    display "Weight category: ${weight_category}"
}

if ("${arg2}" != "") {
    global weight_versions "${arg2}"
    display "Weight versions: ${weight_versions}"
}

if ("${arg3}" != "") {
    global weight_window "${arg3}"
    display "Weight window: ${weight_window}"
}

if ("${arg4}" != "") {
    global wtype "${arg4}"
}
* Always echo the weight type in effect (blank line if it is not set).
display "${wtype}"
capture noi {

* 10_prep_beaind_regressions.do
* Suffix identifying the patent-classification version used in file names.
global cvers _ipc4

* -----------------------------------------------
* Merge the patent count data, collapse to one period
* -----------------------------------------------

* Process the using-industry (use) and making-industry (mk) files in turn.
* Each pass leaves a tempfile (patdt_use / patdt_mk) keyed on appln_year and
* beaind, with every other variable suffixed by the direction.
foreach side in use mk {
    local idvar beaind_`side'

    use ${alm_data_proc}/patents_beaind_`side'${cvers}.dta, clear

    * Drop rows without an industry code and store the code as a
    * two-digit, zero-padded string under the common name beaind.
    drop if missing(`idvar')
    capture destring(`idvar'), replace
    generate beaind = string(`idvar', "%02.0f")
    drop `idvar'

    * Tag every remaining variable except the merge keys with the direction.
    ds appln_year beaind, not
    local counts `r(varlist)'
    foreach cnt of local counts {
        rename `cnt' `cnt'_`side'
    }

    tempfile patdt_`side'
    save `patdt_`side'', replace
}

* Merge the using- and making-industry counts on application year and
* industry, keeping unmatched rows from both sides.
use `patdt_use', clear
mmerge appln_year beaind using `patdt_mk', unmatched(both)

* Membership flags taken from the merge result
* (1 = only in the use file, 2 = only in the mk file, 3 = in both).
* in_use is derived here instead of being set to 1 before the merge: that
* left it missing on mk-only rows, and because missing compares greater than
* any number, the later "in_use > 0" test turned it into 1 for every row.
gen byte in_use = inlist(_merge, 1, 3)
gen byte in_mk  = inlist(_merge, 2, 3)
drop _merge
sort beaind appln_year

* Keep application years 1980-1998 (rows with a missing year are dropped too)
* and discard the year, since counts are pooled over the whole interval.
keep if inrange(appln_year, 1980, 1998)
drop appln_year

* Sum all patent counts by industry and membership flags.
ds beaind in_use in_mk, not
local counts `r(varlist)'
collapse (sum) `counts', by(beaind in_use in_mk)

* Assign the pooled counts to the end year 1998 (coded 98) and make the
* industry code numeric again.
gen year = 98
destring(beaind), replace

tempfile ind_patcounts_8098
save `ind_patcounts_8098', replace

* Automation shares: each technology count divided by the number of patents
* in the relevant field, separately for using and making industries.
foreach side in use mk {
    local denom in_relevant_field_`side'

    foreach tech in auto90 auto95 pauto90 pauto95 {
        local num   `tech'_`side'
        local share sh_`tech'_`side'

        generate `share' = (`num' / `denom')
        * A zero numerator or denominator gives a share of exactly zero
        * (this also replaces the missing value produced by division by zero).
        replace `share' = 0 if (`denom' == 0 | `num' == 0)
    }

    rename in_relevant_field_b_`side' in_relevant_field_bia_`side'

    * Set the membership flag to 1 wherever it is positive. Missing values
    * compare greater than any number in Stata, so a missing flag becomes 1 too.
    replace in_`side' = 1 if in_`side' > 0
}

* Total relevant-field patents (using + making) per industry, summed over all
* rows that share a beaind code.
bys beaind: egen tot_machinery_patents = total(in_relevant_field_use + in_relevant_field_mk)

* List the industries without any such patents in a separate log file.
* Close a log named num left open by an earlier aborted run first; otherwise
* log using stops with an error because the log name is already in use.
cap log close num
log using ${numb_dir}/beaind_industries_without_patents${cvers}.log, replace name(num)
preserve
* One row per industry is enough for the listing; the full data are restored below.
duplicates drop beaind, force
list beaind if tot_machinery_patents == 0
restore
cap log close num

* Keep the identifiers, membership flags, automation counts/shares and
* relevant-field counts, then save the patent data set.
keep year in_* beaind *auto* in_relevant_field*
save ${alm_data_proc}/beaind_patents_data${cvers}.dta, replace

* ------------------------------------------------------------
* Prep and aggr. ALM data (for employment and computerization)
* ------------------------------------------------------------
* Data source:
*Autor, David H., Frank Levy, and Richard J. Murnane. 2003. "The Skill Content of Recent Technological Change:
*An Empirical Exploration." Quarterly Journal of Economics 118 (4): 1279-1334. Accessed November 2020. 
*https://economics.mit.edu/people/faculty/david-h-autor/data-archive
use ${alm_data_raw}/alm/inddot77means6098-centiles-cen60basis.dta, clear 
* Keep the two endpoint years (coded 80 and 98) and only the rows with sex == 0.
* NOTE(review): sex == 0 presumably denotes the both-sexes aggregate - confirm against the codebook.
keep if year == 80 | year == 98
order ind6090 nipa6090 year yrtext edcat sex 
* The row order set here (industry, year, education category) is relied upon
* by the [_n+k] look-ups further down.
sort ind6090 year edcat
keep if sex == 0
* Number of remaining rows per industry; industries with fewer than two rows are dropped.
egen count=count(1*(sex==0)), by(ind6090)
drop if count<2

*Reorder so that we get correct 1st diffs
* Year 80 is relabelled 88 here; the relabelling is reversed by
* "recode year 88=80" before the regression data are saved.
recode year 80=88

* Create the computer use variables over time
* tag marks rows with any missing computer-use variable; exactly those rows
* are dropped on the next line, so tag is 0 on every remaining row.
gen tag = (use84==. | use89==. | use93==. | use97==.)
drop if use84==. | use89==. | use93==. | use97==.
* Change between use84 and use97, divided by 13 and multiplied by 10.
* NOTE(review): presumably a per-decade change over the 13 years 1984-1997 - confirm.
gen computeruse=10*(use97-use84)/13

* Create employment counts for all ed categories
* Each total is computed on the rows of one education category only
* (edcat 1-4) and is missing on all other rows.
egen emp_hsd = total(lswt) if edcat == 1, by(ind6090 year)
egen emp_hsg = total(lswt) if edcat == 2, by(ind6090 year)
egen emp_smc = total(lswt) if edcat == 3, by(ind6090 year)
egen emp_clg = total(lswt) if edcat == 4, by(ind6090 year)
* Shift every total up by k rows so that the row k positions before the
* edcat == k row receives it.
* NOTE(review): this puts the totals on the edcat == 0 row only if each
* industry-year block holds edcat 0,1,2,3,4 in consecutive rows; rows dropped
* above for missing computer use could break that - confirm.
replace emp_hsd = emp_hsd[_n+1]
replace emp_hsg = emp_hsg[_n+2]
replace emp_smc = emp_smc[_n+3]
replace emp_clg = emp_clg[_n+4]
* High-skilled = edcat 4; low-skilled = edcat 1, 2 and 3 together.
egen hskilled = total(lswt) if edcat == 4, by(ind6090 year)
egen lskilled = total(lswt) if edcat == 1 | edcat == 2 | edcat == 3, by(ind6090 year)
* Copy both totals onto the edcat == 0 row using the same positional look-up
* (same row-order assumption as above).
replace hskilled = hskilled[_n+4] if edcat == 0
replace lskilled = lskilled[_n+1] if edcat == 0
* Keep only the edcat == 0 rows; their lswt becomes the employment measure emp.
drop if edcat ~= 0
gen emp = lswt

* aggregate to beaind
* Keep the variables needed below and attach the BEA industry code through
* the ind6090-to-beaind crosswalk; only industries found in both files are kept.
keep year computeruse ind6090* emp* mqmath mqdcp mqsts mqfing mqehf hskilled lskilled
mmerge ind6090 using ${alm_data_proc}/cw_ind6090_beaind.dta, unmatched(both)
tab _m 
keep if _m == 3

* Collapse to industry-year cells: employment-weighted means for computer use
* and the task measures, plain totals for the employment counts.
* The totals use rawsum, not sum: with aweights, (sum) multiplies every value
* by its normalised analytic weight, so the result is not the cell total.
* rawsum ignores the weight (apart from excluding zero-weight observations).
collapse (rawsum) emp* hskilled lskilled (mean) computeruse mqmath mqdcp mqsts mqfing mqehf [aw=emp],  by(beaind year)

* Dot variables scaled 0 to 10
* Within-industry first differences: with rows ordered by year inside each
* beaind, the value on the later row minus the value on the preceding row
* (missing on the first row of every industry).
sort beaind year
quietly by beaind: gen dqmath   =mqmath-mqmath[_n-1]
quietly by beaind: gen dqdcp    =mqdcp-mqdcp[_n-1]
quietly by beaind: gen dqsts    =mqsts-mqsts[_n-1]
* NOTE(review): mqfinger is presumably the full name of the variable that the
* keep and collapse above refer to by the abbreviation mqfing - confirm.
quietly by beaind: gen dqfing   =mqfinger-mqfinger[_n-1]
quietly by beaind: gen dqehf    =mqehf-mqehf[_n-1]

* 10 times annualized changes (of 18 years)
* foreach replaces the obsolete, no longer documented "for var ...:" prefix.
foreach v of varlist dqmath dqdcp dqsts dqfing dqehf {
    replace `v' = 10*(`v'/(18))
}

* H/L-ratio
* Ratio of high- to low-skilled employment, set to 0 when the high-skilled
* count is missing and to 1 when the low-skilled count is missing.
gen ratioHL = hskilled/lskilled 
replace ratioHL = 0 if hskilled == . 
replace ratioHL = 1 if lskilled == . 
by beaind: gen dratioHL = ratioHL-ratioHL[_n-1]

* And the log employment count differences
foreach var in lskilled hskilled emp_hsd emp_hsg emp_smc emp_clg { 
    gen log_`var' = log(`var')
    by beaind: gen dlog_`var' = log_`var' - log_`var'[_n-1]
}

* Rescale weights to sum to 1 in each year; and create an average weight for the first difference regressions
egen totwt=sum(emp),by(year) 
gen emp_sh =emp/totwt
sort beaind year
* Average of the current and the preceding row's employment share within industry.
by beaind: gen avwt=(emp_sh + emp_sh[_n-1])/2
clonevar emp_mid_weight_alm = avwt 

* properly scale employment and do emp_change in logs
* Levels are divided by 1000; midpoint = mean of the two endpoint rows.
by beaind: gen emp_midpoint =(emp + emp[_n-1])/2
replace emp_midpoint = emp_midpoint / 1000
* Initial employment is the preceding row WITHIN the same industry. The by
* prefix matters: without it, an industry whose first row is the end year
* would pick up the employment of the previous industry instead of missing.
by beaind: gen emp_initial = emp[_n-1] / 1000
gen emp_end = emp / 1000
clonevar emp_init_alm = emp_initial
clonevar emp_mid_alm = emp_midpoint
clonevar emp_end_alm = emp_end
gen log_emp_alm = log(emp_end)
by beaind: gen dlog_emp_alm = log_emp_alm - log_emp_alm[_n-1]

* Cluster identifier (one group per industry code) and a manufacturing
* indicator for industry codes 20 through 39.
egen uas_beaind = group(beaind)
generate manuf = inrange(beaind, 20, 39)

* Undo the temporary relabelling of the start year (88 back to 80) and keep
* only the end-year rows, which carry the first differences.
replace year = 80 if year == 88
drop if year != 98

* Variables retained in the regression data set.
local keepers year computeruse *beaind* emp_initial manuf        ///
    emp_mid_weight_alm emp_*_alm log_emp_alm dlog_emp_alm avwt   ///
    dqmath dqdcp dqsts dqfing dqehf                              ///
    hskilled lskilled dlog_hskilled dlog_lskilled dlog_emp_* dratioHL
keep `keepers'
save ${final_dir}/beaind_patents_regression_data${cvers}.dta, replace

* ------------------------------------------------------------
* Combine data
* ------------------------------------------------------------

* The combined file overwrites the ALM regression file saved earlier
* (same path), so the path is held in one local and used for both steps.
local combined ${final_dir}/beaind_patents_regression_data${cvers}.dta

* Start from the patent data and add the ALM variables and the labour-share
* data, keeping every patent row whether or not it finds a match.
use ${alm_data_proc}/beaind_patents_data${cvers}.dta, clear
mmerge beaind using `combined', unmatched(master)
mmerge beaind using ${alm_data_proc}/beaind_lshare_198097.dta, unmatched(master)
drop _merge
sort beaind

* Remove the _b_ / bia variants.
* NOTE(review): drop stops with an error if a pattern matches no variable - confirm that both patterns still match something.
drop *_b_* *bia*

* Pause for one second before overwriting the file that was just read.
sleep 1000
save `combined', replace

}
* Report the outcome of the captured block above: _rc is non-zero when any
* command inside it failed.
if _rc {
    display "Execution finished with errors."
}
else {
    display "Execution finished successfully."
}

* Close the main log (no error if it is not open).
capture log close dat